/*****************************************************************************
 * This file is part of Kvazaar HEVC encoder.
 *
 * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
 * All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 * 
 * * Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 * 
 * * Redistributions in binary form must reproduce the above copyright notice, this
 *   list of conditions and the following disclaimer in the documentation and/or
 *   other materials provided with the distribution.
 * 
 * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
 *   contributors may be used to endorse or promote products derived from
 *   this software without specific prior written permission.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ****************************************************************************/

#include "intra.h"

#include <stdlib.h>

#include "image.h"
#include "kvz_math.h"
#include "strategies/strategies-intra.h"
#include "tables.h"
#include "transform.h"
#include "videoframe.h"

// Tables for looking up the number of intra reference pixels based on
// the prediction unit's coordinate within an LCU.
// Generated by "tools/generate_ref_pixel_tables.py".
//
// Both tables are indexed as [y / 4][x / 4], where (x, y) is the luma
// position of the block inside its LCU. The reference builders in this file
// shift the looked-up value right by one for chroma and then clamp it to
// twice the block width and to the remaining picture size.
//
// num_ref_pixels_top: count used for the row above the block, starting at
// the block's left edge.
static const uint8_t num_ref_pixels_top[16][16] = {
  { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
  {  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4 },
  { 16, 12,  8,  4, 16, 12,  8,  4, 16, 12,  8,  4, 16, 12,  8,  4 },
  {  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4 },
  { 32, 28, 24, 20, 16, 12,  8,  4, 32, 28, 24, 20, 16, 12,  8,  4 },
  {  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4 },
  { 16, 12,  8,  4, 16, 12,  8,  4, 16, 12,  8,  4, 16, 12,  8,  4 },
  {  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4 },
  { 64, 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12,  8,  4 },
  {  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4 },
  { 16, 12,  8,  4, 16, 12,  8,  4, 16, 12,  8,  4, 16, 12,  8,  4 },
  {  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4 },
  { 32, 28, 24, 20, 16, 12,  8,  4, 32, 28, 24, 20, 16, 12,  8,  4 },
  {  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4 },
  { 16, 12,  8,  4, 16, 12,  8,  4, 16, 12,  8,  4, 16, 12,  8,  4 },
  {  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4 }
};
// Number of intra reference pixels used for the column left of a block,
// starting at the block's top edge. Indexed as [y / 4][x / 4] with the luma
// position of the block inside its LCU; the reference builders in this file
// shift the value right by one for chroma before clamping it.
static const uint8_t num_ref_pixels_left[16][16] = {
  { 64,  4,  8,  4, 16,  4,  8,  4, 32,  4,  8,  4, 16,  4,  8,  4 },
  { 60,  4,  4,  4, 12,  4,  4,  4, 28,  4,  4,  4, 12,  4,  4,  4 },
  { 56,  4,  8,  4,  8,  4,  8,  4, 24,  4,  8,  4,  8,  4,  8,  4 },
  { 52,  4,  4,  4,  4,  4,  4,  4, 20,  4,  4,  4,  4,  4,  4,  4 },
  { 48,  4,  8,  4, 16,  4,  8,  4, 16,  4,  8,  4, 16,  4,  8,  4 },
  { 44,  4,  4,  4, 12,  4,  4,  4, 12,  4,  4,  4, 12,  4,  4,  4 },
  { 40,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4 },
  { 36,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4 },
  { 32,  4,  8,  4, 16,  4,  8,  4, 32,  4,  8,  4, 16,  4,  8,  4 },
  { 28,  4,  4,  4, 12,  4,  4,  4, 28,  4,  4,  4, 12,  4,  4,  4 },
  { 24,  4,  8,  4,  8,  4,  8,  4, 24,  4,  8,  4,  8,  4,  8,  4 },
  { 20,  4,  4,  4,  4,  4,  4,  4, 20,  4,  4,  4,  4,  4,  4,  4 },
  { 16,  4,  8,  4, 16,  4,  8,  4, 16,  4,  8,  4, 16,  4,  8,  4 },
  { 12,  4,  4,  4, 12,  4,  4,  4, 12,  4,  4,  4, 12,  4,  4,  4 },
  { 8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4 },
  { 4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4 }
};

/**
 * \brief Derive the three candidate luma intra modes for a PU from its
 *        left and above neighbours.
 *
 * \param x         Not used by this function.
 * \param y         Luma y-coordinate of the PU. The above neighbour is
 *                  ignored when y is a multiple of LCU_WIDTH.
 * \param preds     Output array; entries 0..2 are written.
 * \param cur_pu    Not used by this function.
 * \param left_pu   PU on the left, or NULL.
 * \param above_pu  PU above, or NULL.
 * \return Always 1.
 */
int8_t kvz_intra_get_dir_luma_predictor(
  const uint32_t x,
  const uint32_t y,
  int8_t *preds,
  const cu_info_t *const cur_pu,
  const cu_info_t *const left_pu,
  const cu_info_t *const above_pu)
{
  const int8_t planar = 0;
  const int8_t dc = 1;
  const int8_t vertical = 26;

  // A neighbour that is absent or not intra coded counts as DC.
  int8_t cand_left = dc;
  int8_t cand_above = dc;

  if (left_pu && left_pu->type == CU_INTRA) {
    cand_left = left_pu->intra.mode;
  }
  if (above_pu && above_pu->type == CU_INTRA && y % LCU_WIDTH != 0) {
    cand_above = above_pu->intra.mode;
  }

  if (cand_left != cand_above) {
    // Two distinct candidates: keep both and pick a third that differs.
    preds[0] = cand_left;
    preds[1] = cand_above;

    if (cand_left && cand_above) {
      // Neither candidate is planar.
      preds[2] = planar;
    } else if ((cand_left + cand_above) < 2) {
      // Planar and DC are both taken.
      preds[2] = vertical;
    } else {
      preds[2] = dc;
    }
  } else if (cand_left > 1) {
    // Identical angular candidates: add the two adjacent angular modes,
    // wrapping around within the angular range.
    preds[0] = cand_left;
    preds[1] = ((cand_left + 29) % 32) + 2;
    preds[2] = ((cand_left - 1 ) % 32) + 2;
  } else {
    // Identical non-angular candidates: use the fixed default list.
    preds[0] = planar;
    preds[1] = dc;
    preds[2] = vertical;
  }

  return 1;
}

#if KVZ_SEL_ENCRYPTION
/**
 * \brief Derive the three candidate luma intra modes for a PU from the
 *        mode_encry field of its left and above neighbours.
 *
 * \param x         Not used by this function.
 * \param y         Luma y-coordinate of the PU. The above neighbour is
 *                  ignored when y is a multiple of LCU_WIDTH.
 * \param preds     Output array; entries 0..2 are written.
 * \param cur_pu    Not used by this function.
 * \param left_pu   PU on the left, or NULL.
 * \param above_pu  PU above, or NULL.
 * \return Always 1.
 */
int8_t kvz_intra_get_dir_luma_predictor_encry(
  const uint32_t x,
  const uint32_t y,
  int8_t *preds,
  const cu_info_t *const cur_pu,
  const cu_info_t *const left_pu,
  const cu_info_t *const above_pu)
{
  const int8_t planar = 0;
  const int8_t dc = 1;
  const int8_t vertical = 26;

  // A neighbour that is absent or not intra coded counts as DC.
  int8_t cand_left = dc;
  int8_t cand_above = dc;

  if (left_pu && left_pu->type == CU_INTRA) {
    cand_left = left_pu->intra.mode_encry;
  }
  if (above_pu && above_pu->type == CU_INTRA && y % LCU_WIDTH != 0) {
    cand_above = above_pu->intra.mode_encry;
  }

  if (cand_left != cand_above) {
    // Two distinct candidates: keep both and pick a third that differs.
    preds[0] = cand_left;
    preds[1] = cand_above;

    if (cand_left && cand_above) {
      // Neither candidate is planar.
      preds[2] = planar;
    } else if ((cand_left + cand_above) < 2) {
      // Planar and DC are both taken.
      preds[2] = vertical;
    } else {
      preds[2] = dc;
    }
  } else if (cand_left > 1) {
    // Identical angular candidates: add the two adjacent angular modes,
    // wrapping around within the angular range.
    preds[0] = cand_left;
    preds[1] = ((cand_left + 29) % 32) + 2;
    preds[2] = ((cand_left - 1 ) % 32) + 2;
  } else {
    // Identical non-angular candidates: use the fixed default list.
    preds[0] = planar;
    preds[1] = dc;
    preds[2] = vertical;
  }

  return 1;
}
#endif

/**
 * \brief Fill refs->filtered_ref with a (1, 2, 1) / 4 smoothed copy of
 *        refs->ref, unless that has already been done.
 *
 * \param log2_width  Log2 of the block width; 2 * width + 1 samples of each
 *                    reference array are processed.
 * \param refs        Reference buffers. filtered_initialized is set to true.
 */
static void intra_filter_reference(
  int_fast8_t log2_width,
  kvz_intra_references *refs)
{
  // The smoothed copy is computed at most once per reference set.
  if (refs->filtered_initialized) {
    return;
  }
  refs->filtered_initialized = true;

  const kvz_pixel *const src_left = refs->ref.left;
  const kvz_pixel *const src_top = refs->ref.top;
  kvz_pixel *const dst_left = refs->filtered_ref.left;
  kvz_pixel *const dst_top = refs->filtered_ref.top;

  // Index of the final sample in each array (array length is 2 * width + 1).
  const int last = 2 * (1 << log2_width);

  // The shared corner sample is smoothed using the first sample of each arm.
  const kvz_pixel corner = (src_left[1] + 2 * src_left[0] + src_top[1] + 2) / 4;
  dst_left[0] = corner;
  dst_top[0] = corner;

  // Interior samples: both arms are independent, so they share one loop.
  for (int i = 1; i < last; ++i) {
    dst_left[i] = (src_left[i - 1] + 2 * src_left[i] + src_left[i + 1] + 2) / 4;
    dst_top[i] = (src_top[i - 1] + 2 * src_top[i] + src_top[i + 1] + 2) / 4;
  }

  // The outermost sample of each arm has only one neighbour and is copied.
  dst_left[last] = src_left[last];
  dst_top[last] = src_top[last];
}


/**
 * \brief Adjust one edge line of a predicted block by half the difference
 *        between each reference sample and the corner reference sample.
 *
 * \param width   Number of samples to adjust.
 * \param stride  Distance in block between consecutive adjusted samples.
 * \param ref     Reference array; ref[0] is the corner, ref[1..width] are used.
 * \param block   Predicted block, modified in place.
 */
static void intra_post_process_angular(
  unsigned width,
  unsigned stride,
  const kvz_pixel *ref,
  kvz_pixel *block)
{
  const kvz_pixel corner = ref[0];

  for (unsigned n = 0; n < width; ++n) {
    const unsigned pos = n * stride;
    const kvz_pixel neighbour = ref[n + 1];
    const kvz_pixel predicted = block[pos];
    block[pos] = CLIP_TO_PIXEL(predicted + ((neighbour - corner) >> 1));
  }
}


/**
 * \brief Generate DC prediction.
 *
 * Every sample of the block is set to the rounded average of the width
 * samples directly above and the width samples directly left of it.
 *
 * \param log2_width  Log2 of width, range 2..5.
 * \param ref_top     Pointer to -1 index of above reference, length=width*2+1.
 * \param ref_left    Pointer to -1 index of left reference, length=width*2+1.
 * \param out_block   Buffer of size width*width.
 */
static void intra_pred_dc(
  const int_fast8_t log2_width,
  const kvz_pixel *const ref_top,
  const kvz_pixel *const ref_left,
  kvz_pixel *const out_block)
{
  const int width = 1 << log2_width;

  // Use an accumulator that is guaranteed to hold at least 32 bits: the sum
  // of 2 * 32 samples deeper than 8 bits does not fit in a 16-bit type.
  int_fast32_t sum = 0;
  for (int i = 1; i <= width; ++i) {
    sum += ref_top[i];
    sum += ref_left[i];
  }

  // Divide by 2 * width, rounding to nearest.
  const kvz_pixel dc_val = (kvz_pixel)((sum + width) >> (log2_width + 1));
  const int block_size = width * width;

  for (int i = 0; i < block_size; ++i) {
    out_block[i] = dc_val;
  }
}


/**
 * \brief Generate an intra prediction block.
 *
 * \param refs             Reference samples. The filtered reference is
 *                         computed on demand when the chosen mode needs it.
 * \param log2_width       Log2 of the block width.
 * \param mode             Intra mode: 0 is planar, 1 is DC, anything else is
 *                         handled as an angular mode.
 * \param color            Colour plane being predicted.
 * \param dst              Output block.
 * \param filter_boundary  Enables the extra edge adjustment that is applied
 *                         to luma blocks narrower than 32 for modes 10 and 26.
 */
void kvz_intra_predict(
  kvz_intra_references *refs,
  int_fast8_t log2_width,
  int_fast8_t mode,
  color_t color,
  kvz_pixel *dst,
  bool filter_boundary)
{
  const int_fast8_t width = 1 << log2_width;
  const bool is_luma = (color == COLOR_Y);

  // Chroma, DC and 4x4 blocks always predict from the unfiltered reference.
  bool smooth = false;
  if (is_luma && mode != 1 && width != 4) {
    if (mode == 0) {
      // Planar always uses the filtered reference.
      smooth = true;
    } else {
      // Angular modes use the filtered reference only when they are far
      // enough from modes 10 and 26; the limit depends on block size.
      static const int kvz_intra_hor_ver_dist_thres[5] = { 0, 7, 1, 0, 0 };
      const int limit = kvz_intra_hor_ver_dist_thres[kvz_math_floor_log2(width) - 2];
      const int to_mode_26 = abs(mode - 26);
      const int to_mode_10 = abs(mode - 10);
      smooth = MIN(to_mode_26, to_mode_10) > limit;
    }
  }

  if (smooth && !refs->filtered_initialized) {
    intra_filter_reference(log2_width, refs);
  }
  const kvz_intra_ref *const src = smooth ? &refs->filtered_ref : &refs->ref;

  switch (mode) {
    case 0:
      kvz_intra_pred_planar(log2_width, src->top, src->left, dst);
      break;

    case 1:
      // Luma DC blocks narrower than 32 get extra edge filtering.
      if (is_luma && width < 32) {
        kvz_intra_pred_filtered_dc(log2_width, src->top, src->left, dst);
      } else {
        intra_pred_dc(log2_width, src->top, src->left, dst);
      }
      break;

    default:
      kvz_angular_pred(log2_width, mode, src->top, src->left, dst);
      if (is_luma && width < 32 && filter_boundary) {
        if (mode == 10) {
          // Adjust the first row using the top reference.
          intra_post_process_angular(width, 1, src->top, dst);
        } else if (mode == 26) {
          // Adjust the first column using the left reference.
          intra_post_process_angular(width, width, src->left, dst);
        }
      }
      break;
  }
}


/**
 * \brief Build the unfiltered intra reference samples for a block at any
 *        position, including blocks with luma_px->x == 0 or luma_px->y == 0.
 *
 * Writes refs->ref.left[0..2*width] and refs->ref.top[0..2*width]. Index 0 of
 * both arrays receives the top-left corner sample. Samples that cannot be
 * copied from the LCU are filled by repeating the nearest copied sample, or
 * with the mid-range value 1 << (KVZ_BIT_DEPTH - 1) when the block has
 * neither a left nor a top neighbour.
 *
 * \param log2_width  Log2 of the block width, asserted to be in 2..5.
 * \param color       Colour plane whose buffers are read from the LCU.
 * \param luma_px     Block position in luma pixels.
 * \param pic_px      Limits against which luma_px is clipped; the caller in
 *                    this file passes the frame width and height.
 * \param lcu         LCU providing reconstruction and border buffers.
 * \param refs        Output; filtered_initialized is reset to false.
 */
void kvz_intra_build_reference_any(
  const int_fast8_t log2_width,
  const color_t color,
  const vector2d_t *const luma_px,
  const vector2d_t *const pic_px,
  const lcu_t *const lcu,
  kvz_intra_references *const refs)
{
  assert(log2_width >= 2 && log2_width <= 5);

  // The filtered copy must be recomputed after the reference is rebuilt.
  refs->filtered_initialized = false;
  kvz_pixel *out_left_ref = &refs->ref.left[0];
  kvz_pixel *out_top_ref = &refs->ref.top[0];

  // Mid-range sample value, used when no neighbouring sample exists.
  const kvz_pixel dc_val = 1 << (KVZ_BIT_DEPTH - 1);
  const int is_chroma = color != COLOR_Y ? 1 : 0;
  const int_fast8_t width = 1 << log2_width;

  // Convert luma coordinates to chroma coordinates for chroma.
  const vector2d_t lcu_px = {
    luma_px->x % LCU_WIDTH,
    luma_px->y % LCU_WIDTH
  };
  const vector2d_t px = {
    lcu_px.x >> is_chroma,
    lcu_px.y >> is_chroma,
  };

  // Init pointers to LCUs reconstruction buffers, such that index 0 refers to block coordinate 0.
  // NOTE(review): the plane selection treats color 0 as luma and color 1 as
  // the first chroma plane — presumably COLOR_Y == 0 and COLOR_U == 1; confirm.
  const kvz_pixel *left_ref = !color ? &lcu->left_ref.y[1] : (color == 1) ? &lcu->left_ref.u[1] : &lcu->left_ref.v[1];
  const kvz_pixel *top_ref = !color ? &lcu->top_ref.y[1] : (color == 1) ? &lcu->top_ref.u[1] : &lcu->top_ref.v[1];
  const kvz_pixel *rec_ref = !color ? lcu->rec.y : (color == 1) ? lcu->rec.u : lcu->rec.v;

  // Init top borders pointer to point to the correct place in the correct reference array.
  // Inside the LCU the row above is read from the reconstruction buffer;
  // on the first row of the LCU it is read from the 1D top border buffer.
  const kvz_pixel *top_border;
  if (px.y) {
    top_border = &rec_ref[px.x + (px.y - 1) * (LCU_WIDTH >> is_chroma)];
  } else {
    top_border = &top_ref[px.x];
  }

  // Init left borders pointer to point to the correct place in the correct reference array.
  // Inside the LCU the left column is strided through the reconstruction
  // buffer; on the first column it is contiguous in the 1D left border buffer.
  const kvz_pixel *left_border;
  int left_stride; // Distance between reference samples.
  if (px.x) {
    left_border = &rec_ref[px.x - 1 + px.y * (LCU_WIDTH >> is_chroma)];
    left_stride = LCU_WIDTH >> is_chroma;
  } else {
    left_border = &left_ref[px.y];
    left_stride = 1;
  }

  // Generate left reference.
  if (luma_px->x > 0) {
    // Get the number of reference pixels based on the PU coordinate within the LCU.
    int px_available_left = num_ref_pixels_left[lcu_px.y / 4][lcu_px.x / 4] >> is_chroma;

    // Limit the number of available pixels based on block size and dimensions
    // of the picture.
    px_available_left = MIN(px_available_left, width * 2);
    px_available_left = MIN(px_available_left, (pic_px->y - luma_px->y) >> is_chroma);

    // Copy pixels from coded CUs.
    for (int i = 0; i < px_available_left; ++i) {
      out_left_ref[i + 1] = left_border[i * left_stride];
    }
    // Extend the last pixel for the rest of the reference values.
    // NOTE(review): if px_available_left were 0 this would read
    // out_left_ref[0], which has not been written yet at this point —
    // presumably callers guarantee at least one available pixel; confirm.
    kvz_pixel nearest_pixel = out_left_ref[px_available_left];
    for (int i = px_available_left; i < width * 2; ++i) {
      out_left_ref[i + 1] = nearest_pixel;
    }
  } else {
    // If we are on the left edge, extend the first pixel of the top row.
    kvz_pixel nearest_pixel = luma_px->y > 0 ? top_border[0] : dc_val;
    for (int i = 0; i < width * 2; i++) {
      out_left_ref[i + 1] = nearest_pixel;
    }
  }

  // Generate top-left reference.
  if (luma_px->x > 0 && luma_px->y > 0) {
    // If the block is at an LCU border, the top-left must be copied from
    // the border that points to the LCUs 1D reference buffer.
    if (px.x == 0) {
      out_left_ref[0] = left_border[-1 * left_stride];
      out_top_ref[0] = left_border[-1 * left_stride];
    } else {
      out_left_ref[0] = top_border[-1];
      out_top_ref[0] = top_border[-1];
    }
  } else {
    // Copy reference clockwise.
    // This depends on the left reference having been generated above.
    out_left_ref[0] = out_left_ref[1];
    out_top_ref[0] = out_left_ref[1];
  }

  // Generate top reference.
  if (luma_px->y > 0) {
    // Get the number of reference pixels based on the PU coordinate within the LCU.
    int px_available_top = num_ref_pixels_top[lcu_px.y / 4][lcu_px.x / 4] >> is_chroma;

    // Limit the number of available pixels based on block size and dimensions
    // of the picture.
    px_available_top = MIN(px_available_top, width * 2);
    px_available_top = MIN(px_available_top, (pic_px->x - luma_px->x) >> is_chroma);

    // Copy all the pixels we can.
    for (int i = 0; i < px_available_top; ++i) {
      out_top_ref[i + 1] = top_border[i];
    }
    // Extend the last pixel for the rest of the reference values.
    // NOTE(review): if px_available_top were 0 this would read
    // top_border[-1] — presumably callers guarantee at least one available
    // pixel; confirm.
    kvz_pixel nearest_pixel = top_border[px_available_top - 1];
    for (int i = px_available_top; i < width * 2; ++i) {
      out_top_ref[i + 1] = nearest_pixel;
    }
  } else {
    // Extend nearest pixel.
    kvz_pixel nearest_pixel = luma_px->x > 0 ? left_border[0] : dc_val;
    for (int i = 0; i < width * 2; i++) {
      out_top_ref[i + 1] = nearest_pixel;
    }
  }
}

/**
 * \brief Build the unfiltered intra reference samples for a block that has
 *        both a left and a top neighbour.
 *
 * Variant of kvz_intra_build_reference_any without the branches for
 * luma_px->x == 0 and luma_px->y == 0; kvz_intra_build_reference() selects it
 * only when both coordinates are greater than zero. Samples are copied and
 * extended in groups of four.
 *
 * Writes refs->ref.left[0..2*width] and refs->ref.top[0..2*width]; index 0 of
 * both arrays receives the top-left corner sample.
 *
 * \param log2_width  Log2 of the block width, asserted to be in 2..5.
 * \param color       Colour plane whose buffers are read from the LCU.
 * \param luma_px     Block position in luma pixels.
 * \param pic_px      Limits against which luma_px is clipped; the caller in
 *                    this file passes the frame width and height.
 * \param lcu         LCU providing reconstruction and border buffers.
 * \param refs        Output; filtered_initialized is reset to false.
 */
void kvz_intra_build_reference_inner(
  const int_fast8_t log2_width,
  const color_t color,
  const vector2d_t *const luma_px,
  const vector2d_t *const pic_px,
  const lcu_t *const lcu,
  kvz_intra_references *const refs)
{
  assert(log2_width >= 2 && log2_width <= 5);

  // The filtered copy must be recomputed after the reference is rebuilt.
  refs->filtered_initialized = false;
  kvz_pixel * __restrict out_left_ref = &refs->ref.left[0];
  kvz_pixel * __restrict out_top_ref = &refs->ref.top[0];

  const int is_chroma = color != COLOR_Y ? 1 : 0;
  const int_fast8_t width = 1 << log2_width;

  // Convert luma coordinates to chroma coordinates for chroma.
  const vector2d_t lcu_px = {
    luma_px->x % LCU_WIDTH,
    luma_px->y % LCU_WIDTH
  };
  const vector2d_t px = {
    lcu_px.x >> is_chroma,
    lcu_px.y >> is_chroma,
  };

  // Init pointers to LCUs reconstruction buffers, such that index 0 refers to block coordinate 0.
  // NOTE(review): the plane selection treats color 0 as luma and color 1 as
  // the first chroma plane — presumably COLOR_Y == 0 and COLOR_U == 1; confirm.
  const kvz_pixel * __restrict left_ref = !color ? &lcu->left_ref.y[1] : (color == 1) ? &lcu->left_ref.u[1] : &lcu->left_ref.v[1];
  const kvz_pixel * __restrict top_ref = !color ? &lcu->top_ref.y[1] : (color == 1) ? &lcu->top_ref.u[1] : &lcu->top_ref.v[1];
  const kvz_pixel * __restrict rec_ref = !color ? lcu->rec.y : (color == 1) ? lcu->rec.u : lcu->rec.v;

  // Init top borders pointer to point to the correct place in the correct reference array.
  // Inside the LCU the row above is read from the reconstruction buffer;
  // on the first row of the LCU it is read from the 1D top border buffer.
  const kvz_pixel * __restrict top_border;
  if (px.y) {
    top_border = &rec_ref[px.x + (px.y - 1) * (LCU_WIDTH >> is_chroma)];
  } else {
    top_border = &top_ref[px.x];

  }

  // Init left borders pointer to point to the correct place in the correct reference array.
  const kvz_pixel * __restrict left_border;
  int left_stride; // Distance between reference samples.

  // Generate top-left reference.
  // If the block is at an LCU border, the top-left must be copied from
  // the border that points to the LCUs 1D reference buffer.
  if (px.x) {
    left_border = &rec_ref[px.x - 1 + px.y * (LCU_WIDTH >> is_chroma)];
    left_stride = LCU_WIDTH >> is_chroma;
    out_left_ref[0] = top_border[-1];
    out_top_ref[0] = top_border[-1];
  } else {
    left_border = &left_ref[px.y];
    left_stride = 1;
    out_left_ref[0] = left_border[-1 * left_stride];
    out_top_ref[0] = left_border[-1 * left_stride];
  }

  // Generate left reference.

  // Get the number of reference pixels based on the PU coordinate within the LCU.
  int px_available_left = num_ref_pixels_left[lcu_px.y / 4][lcu_px.x / 4] >> is_chroma;

  // Limit the number of available pixels based on block size and dimensions
  // of the picture.
  px_available_left = MIN(px_available_left, width * 2);
  px_available_left = MIN(px_available_left, (pic_px->y - luma_px->y) >> is_chroma);

  // Copy pixels from coded CUs.
  // The do-while copies four samples per iteration, so at least four are
  // always copied and the count is rounded up to a multiple of four.
  int i = 0;
  do {
    out_left_ref[i + 1] = left_border[(i + 0) * left_stride];
    out_left_ref[i + 2] = left_border[(i + 1) * left_stride];
    out_left_ref[i + 3] = left_border[(i + 2) * left_stride];
    out_left_ref[i + 4] = left_border[(i + 3) * left_stride];
    i += 4;
  } while (i < px_available_left);

  // Extend the last pixel for the rest of the reference values.
  // i is the rounded-up copy count, so out_left_ref[i] is the last sample
  // written by the loop above.
  kvz_pixel nearest_pixel = out_left_ref[i];
  for (; i < width * 2; i += 4) {
    out_left_ref[i + 1] = nearest_pixel;
    out_left_ref[i + 2] = nearest_pixel;
    out_left_ref[i + 3] = nearest_pixel;
    out_left_ref[i + 4] = nearest_pixel;
  }

  // Generate top reference.

  // Get the number of reference pixels based on the PU coordinate within the LCU.
  int px_available_top = num_ref_pixels_top[lcu_px.y / 4][lcu_px.x / 4] >> is_chroma;

  // Limit the number of available pixels based on block size and dimensions
  // of the picture.
  px_available_top = MIN(px_available_top, width * 2);
  px_available_top = MIN(px_available_top, (pic_px->x - luma_px->x) >> is_chroma);

  // Copy all the pixels we can.
  // As for the left reference, samples are copied four at a time and the
  // count is rounded up to a multiple of four.
  i = 0;
  do {
    memcpy(out_top_ref + i + 1, top_border + i, 4 * sizeof(kvz_pixel));
    i += 4;
  } while (i < px_available_top);

  // Extend the last pixel for the rest of the reference values.
  nearest_pixel = out_top_ref[i];
  for (; i < width * 2; i += 4) {
    out_top_ref[i + 1] = nearest_pixel;
    out_top_ref[i + 2] = nearest_pixel;
    out_top_ref[i + 3] = nearest_pixel;
    out_top_ref[i + 4] = nearest_pixel;
  }
}

/**
 * \brief Build the unfiltered intra reference samples for a block.
 *
 * Dispatches to kvz_intra_build_reference_inner when the block has both a
 * left and a top neighbour (luma_px->x > 0 and luma_px->y > 0), and to
 * kvz_intra_build_reference_any otherwise. All arguments are forwarded
 * unchanged.
 */
void kvz_intra_build_reference(
  const int_fast8_t log2_width,
  const color_t color,
  const vector2d_t *const luma_px,
  const vector2d_t *const pic_px,
  const lcu_t *const lcu,
  kvz_intra_references *const refs)
{
  // Blocks on the first row or column need the general implementation.
  if (luma_px->x <= 0 || luma_px->y <= 0) {
    kvz_intra_build_reference_any(log2_width, color, luma_px, pic_px, lcu, refs);
    return;
  }

  kvz_intra_build_reference_inner(log2_width, color, luma_px, pic_px, lcu, refs);
}

/**
 * \brief Predict one colour plane of a leaf transform block and write the
 *        prediction into the LCU reconstruction buffer.
 *
 * \param state       Encoder state; supplies the config and frame size.
 * \param x           x-coordinate of the block in luma pixels.
 * \param y           y-coordinate of the block in luma pixels.
 * \param depth       Depth of the block; the width is derived from it.
 * \param intra_mode  Intra mode used for the prediction.
 * \param lcu         LCU whose rec buffer receives the prediction.
 * \param color       Colour plane to predict.
 */
static void intra_recon_tb_leaf(
  encoder_state_t *const state,
  int x,
  int y,
  int depth,
  int8_t intra_mode,
  lcu_t *lcu,
  color_t color)
{
  const bool is_luma = (color == COLOR_Y);
  const int chroma_shift = is_luma ? 0 : 1;

  // Chroma width is half of luma width, when not at maximum depth.
  int log2width = LOG2_LCU_WIDTH - depth;
  if (!is_luma && depth < MAX_PU_DEPTH) {
    log2width -= 1;
  }
  const int tb_width = 1 << log2width;
  const int rec_stride = LCU_WIDTH >> chroma_shift;

  // Gather the reference samples around the block.
  const vector2d_t luma_px = { x, y };
  const vector2d_t pic_px = {
    state->tile->frame->width,
    state->tile->frame->height,
  };
  kvz_intra_references refs;
  kvz_intra_build_reference(log2width, color, &luma_px, &pic_px, lcu, &refs);

  // Boundary filtering applies to luma only and is disabled when lossless
  // coding is combined with implicit RDPCM.
  const kvz_config *cfg = &state->encoder_control->cfg;
  const bool filter_boundary = is_luma && !(cfg->lossless && cfg->implicit_rdpcm);

  kvz_pixel pred[32 * 32];
  kvz_intra_predict(&refs, log2width, intra_mode, color, pred, filter_boundary);

  // Locate the block inside the plane's reconstruction buffer.
  const vector2d_t lcu_px = { SUB_SCU(x) >> chroma_shift, SUB_SCU(y) >> chroma_shift};
  const int index = lcu_px.x + lcu_px.y * rec_stride;

  kvz_pixel *block = NULL;
  if (color == COLOR_Y) {
    block = &lcu->rec.y[index];
  } else if (color == COLOR_U) {
    block = &lcu->rec.u[index];
  } else if (color == COLOR_V) {
    block = &lcu->rec.v[index];
  }

  kvz_pixels_blit(pred, block , tb_width, tb_width, tb_width, rec_stride);
}

/**
 * \brief Reconstruct an intra CU
 *
 * Recurses into four quadrants while depth is 0 or the CU's transform depth
 * is greater than depth; otherwise predicts the leaf block and quantizes its
 * residual. Coded block flags of the CU are cleared for the planes being
 * reconstructed and, after recursion, set from the child CUs.
 *
 * \param state         encoder state
 * \param x             x-coordinate of the CU in luma pixels
 * \param y             y-coordinate of the CU in luma pixels
 * \param depth         depth in the CU tree
 * \param mode_luma     intra mode for luma, or -1 to skip luma recon
 * \param mode_chroma   intra mode for chroma, or -1 to skip chroma recon
 * \param cur_cu        pointer to the CU, or NULL to fetch CU from LCU
 * \param lcu           containing LCU
 */
void kvz_intra_recon_cu(
  encoder_state_t *const state,
  int x,
  int y,
  int depth,
  int8_t mode_luma,
  int8_t mode_chroma,
  cu_info_t *cur_cu,
  lcu_t *lcu)
{
  const vector2d_t lcu_px = { SUB_SCU(x), SUB_SCU(y) };
  if (cur_cu == NULL) {
    cur_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);
  }

  // CBFs may still be set from an earlier pass at this depth; start clean.
  if (mode_luma >= 0) {
    cbf_clear(&cur_cu->cbf, depth, COLOR_Y);
  }
  if (mode_chroma >= 0) {
    cbf_clear(&cur_cu->cbf, depth, COLOR_U);
    cbf_clear(&cur_cu->cbf, depth, COLOR_V);
  }

  const bool is_leaf = depth != 0 && cur_cu->tr_depth <= depth;
  if (is_leaf) {
    // Process a leaf TU. Chroma is only handled at positions that are
    // multiples of 8 luma pixels.
    const bool has_luma = mode_luma != -1;
    const bool has_chroma = mode_chroma != -1 && x % 8 == 0 && y % 8 == 0;

    if (has_luma) {
      intra_recon_tb_leaf(state, x, y, depth, mode_luma, lcu, COLOR_Y);
    }
    if (has_chroma) {
      intra_recon_tb_leaf(state, x, y, depth, mode_chroma, lcu, COLOR_U);
      intra_recon_tb_leaf(state, x, y, depth, mode_chroma, lcu, COLOR_V);
    }

    kvz_quantize_lcu_residual(state, has_luma, has_chroma, x, y, depth, cur_cu, lcu, false);
    return;
  }

  const int8_t width = LCU_WIDTH >> depth;
  const int offset = width / 2;

  // Recurse into the quadrants in the order top-left, top-right,
  // bottom-left, bottom-right.
  for (int quadrant = 0; quadrant < 4; ++quadrant) {
    const int32_t child_x = x + ((quadrant & 1) ? offset : 0);
    const int32_t child_y = y + ((quadrant & 2) ? offset : 0);
    kvz_intra_recon_cu(state, child_x, child_y, depth + 1, mode_luma, mode_chroma, NULL, lcu);
  }

  // Propagate coded block flags from child CUs to parent CU.
  uint16_t child_cbfs[3] = {
    LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y         )->cbf,
    LCU_GET_CU_AT_PX(lcu, lcu_px.x,          lcu_px.y + offset)->cbf,
    LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y + offset)->cbf,
  };

  if (depth <= MAX_DEPTH) {
    if (mode_luma != -1) {
      cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_Y);
    }
    if (mode_chroma != -1) {
      cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_U);
      cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_V);
    }
  }
}
